# Read the engineered training data from its fixed local CSV location
df <- read.csv(
  file = "/Users/naiyueliang/Desktop/BGSU Study/Statistical Learning 2/engineered data/train_engineered.csv"
)
# 1. Report the dimensions of the data frame (rows first, then columns)
cat("Total rows (observations):", dim(df)[1], "\n")
## Total rows (observations): 11734
cat("Total columns (features):", dim(df)[2], "\n")
## Total columns (features): 87
# 2. Check for missing values
# anyNA() is TRUE as soon as a single cell anywhere in df is NA
if (anyNA(df)) {
  cat("⚠️ Missing values detected in some columns\n")
} else {
  cat("No missing values (each column has", nrow(df), "non-null values)\n")
}
## No missing values (each column has 11734 non-null values)
# 3. Data type summary
# Label every column with its first class only. vapply() guarantees a named
# character vector, whereas sapply(df, class) returns a list as soon as one
# column carries more than one class (e.g. c("POSIXct", "POSIXt")), which
# would break the table() call below.
type_counts <- vapply(df, function(col) class(col)[1], character(1))
type_summary <- table(type_counts)
cat("\n Data Type Breakdown:\n\n")
##
## Data Type Breakdown:
# For each type, print how many columns have it and up to three example names
for (t in names(type_summary)) {
  count <- type_summary[[t]]
  example_vars <- head(names(type_counts)[type_counts == t], 3)
  cat(sprintf(
    "- %s: %d columns, e.g., %s\n",
    t,
    count,
    paste(example_vars, collapse = ", ")
  ))
}
## - character: 29 columns, e.g., Student_IDs, Semester, Degree_Type
## - integer: 24 columns, e.g., Course_Code_by_Thousands, Semester_Week, Duration_In_Min
## - logical: 4 columns, e.g., Is_Weekend, Has_Multiple_Majors, Has_Minor
## - numeric: 30 columns, e.g., Term_GPA, Total_Credit_Hours_Earned, Cumulative_GPA
The dataset contains 11,734 observations and 87 features, with no
missing values, indicating good data quality. Variable types include:
Character (29), Integer (24), Logical (4), Numeric (30).
library(ggplot2)
# Duration_In_Min: density-scaled histogram (40 bins) with a kernel density
# curve drawn on top
ggplot(df, aes(x = Duration_In_Min)) +
  geom_histogram(
    # after_stat(density) replaces the deprecated `..density..` notation
    aes(y = after_stat(density)),
    bins = 40, fill = "skyblue", color = "white", alpha = 0.8
  ) +
  # `linewidth` replaces `size` for line geoms since ggplot2 3.4.0
  geom_density(color = "darkblue", linewidth = 1) +
  labs(title = "Distribution of Duration_In_Min", x = "Duration (minutes)", y = "Density") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Quartiles, mean and range of the duration column, then its interquartile range
summary(df[["Duration_In_Min"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.00 45.00 68.00 82.23 104.00 822.00
IQR(df[["Duration_In_Min"]])
## [1] 59
# Occupancy: density-scaled histogram (40 bins) with a kernel density curve
# drawn on top
ggplot(df, aes(x = Occupancy)) +
  geom_histogram(
    # after_stat(density) replaces the deprecated `..density..` notation
    aes(y = after_stat(density)),
    bins = 40, fill = "salmon", color = "white", alpha = 0.8
  ) +
  # `linewidth` replaces `size` for line geoms since ggplot2 3.4.0
  geom_density(color = "darkred", linewidth = 1) +
  labs(title = "Distribution of Occupancy", x = "Occupancy Count", y = "Density") +
  theme_minimal()
# Quartiles, mean and range of the occupancy column, then its interquartile range
summary(df[["Occupancy"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 7.00 11.00 11.62 15.00 40.00
IQR(df[["Occupancy"]])
## [1] 8
1. Duration (in minutes)
The distribution of Duration_In_Min is right-skewed, with most sessions
lasting between 45 and 104 minutes (the interquartile range).
The mean duration is about 82 minutes, but a few long sessions (up to
822 minutes) pull the mean higher than the median (68 minutes).
This skew suggests the presence of outliers or occasional extended
sessions.
2. Occupancy (number of students)
Occupancy is more symmetrically distributed, though still slightly
right-skewed. The typical number of attendees ranges from 7 to 15, with
a mean of ~11.6 and a median of 11. The maximum observed occupancy is
40, but most sessions are much smaller.
# Number of distinct student IDs (entries that do not repeat an earlier one)
sum(!duplicated(df[["Student_IDs"]]))
## [1] 1943
# Number of distinct course names
sum(!duplicated(df[["Course_Name"]]))
## [1] 308
# Number of distinct course numbers
sum(!duplicated(df[["Course_Number"]]))
## [1] 336
The dataset covers 1,943 unique students, 308 distinct course names, and
336 distinct course numbers.
# Major
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# 1. Ten most frequent Major values, as a count table in descending order
head(sort(table(df[["Major"]]), decreasing = TRUE), n = 10)
##
## No Response
## 5580
## MKT-BSBA:Marketing
## 271
## EIEC-BSED:Inclusive Early Childhood
## 229
## ACCT-BSBA:Accounting
## 225
## SUPC-BSBA:Supply Chain Management
## 191
## CONM-BSCM:Construction Mgmt & Technology
## 188
## GBUS-MIN:General Business,SPMGT-BSED:Sport Management
## 178
## BIOL-BS:Biology,SCIENC-MIN:Science
## 124
## HLTHBSDIET:BS in Dietetics
## 119
## MIS-BSBA:Information Systems
## 96
# 2. Bar plot of top 10 categories
# slice_max() supersedes top_n(); like top_n() it keeps ties by default.
# geom_col() is the direct form of geom_bar(stat = "identity") and matches
# the other bar charts in this script.
df %>%
  count(Major) %>%
  slice_max(order_by = n, n = 10) %>%
  ggplot(aes(x = reorder(Major, n), y = n)) +
  geom_col(fill = "skyblue") +
  coord_flip() +
  labs(title = "Top 10 Majors by Frequency", x = "Major", y = "Count") +
  theme_minimal()
# 3. Mean session duration per major: keep majors with more than 30 rows,
#    then chart the ten with the highest mean as horizontal bars
group_by(df, Major) %>%
  summarise(
    n = n(),
    avg_minutes = mean(Duration_In_Min, na.rm = TRUE)
  ) %>%
  filter(n > 30) %>%
  top_n(n = 10, wt = avg_minutes) %>%
  ggplot(aes(x = reorder(Major, avg_minutes), y = avg_minutes)) +
  geom_col(fill = "salmon") +
  coord_flip() +
  ggtitle("Top 10 Majors by Mean Duration") +
  xlab("Major") +
  ylab("Avg. Duration (min)") +
  theme_minimal()
# 4. Mean Occupancy by category
# Only majors with more than 30 rows are considered; the ten with the highest
# mean occupancy are charted. The summary column is named for what it holds
# (it was previously called mean_duration although it averaged Occupancy).
# slice_max() supersedes top_n() and keeps ties by default, as top_n() did.
df %>%
  group_by(Major) %>%
  summarise(
    mean_occupancy = mean(Occupancy, na.rm = TRUE),
    n = n()
  ) %>%
  filter(n > 30) %>%
  slice_max(order_by = mean_occupancy, n = 10) %>%
  ggplot(aes(x = reorder(Major, mean_occupancy), y = mean_occupancy)) +
  geom_col(fill = "salmon") +
  coord_flip() +
  labs(title = "Top 10 Majors by Mean Occupancy", x = "Major", y = "Avg. Occupancy") +
  theme_minimal()
Of the 11,734 records, 5,580 (about 48%) have “No Response” for major. The
most common reported majors include Marketing, Early Childhood
Education, and Accounting. Majors like Sport Management and Supply Chain
Management have the longest average session durations, while Accounting
and Information Systems show higher average occupancy, indicating
possibly larger class sizes.
library(dplyr)
library(ggplot2)
# All the categorical predictors
cat_vars <- c("Class_Standing", "Course_Code_by_Thousands", "Semester")
# For each predictor: print its frequency table, then draw three horizontal
# bar charts (frequency, mean duration, mean occupancy). Levels with 30 or
# fewer rows are dropped from the two mean charts.
# `.data[[var]]` looks a column up by its string name; it replaces
# `!!sym(var)` and needs no symbol construction.
for (var in cat_vars) {
  cat("\n📊 Variable:", var, "\n")
  print(table(df[[var]]) %>% sort(decreasing = TRUE))
  # Frequency of each level
  p1 <- df %>%
    count(.data[[var]]) %>%
    ggplot(aes(x = reorder(.data[[var]], n), y = n)) +
    geom_col(fill = "skyblue") +
    coord_flip() +
    labs(title = paste("Frequency of", var), x = var, y = "Count") +
    theme_minimal()
  print(p1)
  # Average duration per level
  p2 <- df %>%
    group_by(.data[[var]]) %>%
    summarise(
      avg_duration = mean(Duration_In_Min, na.rm = TRUE),
      n = n()
    ) %>%
    filter(n > 30) %>%
    ggplot(aes(x = reorder(.data[[var]], avg_duration), y = avg_duration)) +
    geom_col(fill = "salmon") +
    coord_flip() +
    labs(
      title = paste("Avg. Duration by", var),
      x = var, y = "Avg. Duration (min)"
    ) +
    theme_minimal()
  print(p2)
  # Average occupancy per level (column named for what it holds; it was
  # previously called avg_duration although it averaged Occupancy)
  p3 <- df %>%
    group_by(.data[[var]]) %>%
    summarise(
      avg_occupancy = mean(Occupancy, na.rm = TRUE),
      n = n()
    ) %>%
    filter(n > 30) %>%
    ggplot(aes(x = reorder(.data[[var]], avg_occupancy), y = avg_occupancy)) +
    geom_col(fill = "salmon") +
    coord_flip() +
    labs(
      title = paste("Avg. Occupancy by", var),
      x = var, y = "Avg. Occupancy"
    ) +
    theme_minimal()
  print(p3)
}
##
## 📊 Variable: Class_Standing
##
## Senior Junior Graduate Freshman Sophomore Other
## 8932 1300 642 466 378 16
##
## 📊 Variable: Course_Code_by_Thousands
##
## 1000 2000 3000 99 4000 0 6000 100 5000 7000
## 7163 3012 1188 194 86 55 16 15 3 2
##
## 📊 Variable: Semester
##
## Fall 2016 Spring 2017
## 6482 5252
Seniors represent the largest group in the dataset, followed by Juniors
and Graduates. Interestingly, Juniors have the longest average session
durations, while Sophomores have the highest average occupancy.
Course codes in the 1000s and 2000s are the most frequent, but courses in the 4000s tend to have the longest durations. In terms of semesters, Fall 2016 had more sessions than Spring 2017, and also showed slightly higher average durations and occupancy.
# Course_Name
library(dplyr)
library(ggplot2)
# 1. Ten most frequent Course_Name values, as a count table in descending order
head(sort(table(df[["Course_Name"]]), decreasing = TRUE), n = 10)
##
## Basic Calculus
## 2629
## Introduction to Statistics
## 853
## Calculus and Analytic Geometry
## 589
## College Algebra
## 541
## Business Analytics III: Descriptive Analytics
## 477
## Predictive Analytics
## 406
## Precalculus Mathematics
## 385
## Principles of Microeconomics
## 355
## College Algebra I
## 283
## Mathematics for Architecture/Construct
## 231
# 2. Bar plot of top 10 categories
# slice_max() supersedes top_n(); like top_n() it keeps ties by default.
# geom_col() is the direct form of geom_bar(stat = "identity") and matches
# the other bar charts in this script.
df %>%
  count(Course_Name) %>%
  slice_max(order_by = n, n = 10) %>%
  ggplot(aes(x = reorder(Course_Name, n), y = n)) +
  geom_col(fill = "skyblue") +
  coord_flip() +
  labs(title = "Top 10 Course_Name by Frequency", x = "Course_Name", y = "Count") +
  theme_minimal()
# 3. Mean session duration per course name: keep names with more than 30
#    rows, then chart the ten with the highest mean as horizontal bars
group_by(df, Course_Name) %>%
  summarise(
    n = n(),
    avg_minutes = mean(Duration_In_Min, na.rm = TRUE)
  ) %>%
  filter(n > 30) %>%
  top_n(n = 10, wt = avg_minutes) %>%
  ggplot(aes(x = reorder(Course_Name, avg_minutes), y = avg_minutes)) +
  geom_col(fill = "salmon") +
  coord_flip() +
  ggtitle("Top 10 Course_Name by Mean Duration") +
  xlab("Course_Name") +
  ylab("Avg. Duration (min)") +
  theme_minimal()
# Occupancy
# 4. Mean Occupancy by category
# Only course names with more than 30 rows are considered; the ten with the
# highest mean occupancy are charted. The summary column is named for what it
# holds (it was previously called mean_duration although it averaged
# Occupancy). slice_max() supersedes top_n() and keeps ties by default.
df %>%
  group_by(Course_Name) %>%
  summarise(
    mean_occupancy = mean(Occupancy, na.rm = TRUE),
    n = n()
  ) %>%
  filter(n > 30) %>%
  slice_max(order_by = mean_occupancy, n = 10) %>%
  ggplot(aes(x = reorder(Course_Name, mean_occupancy), y = mean_occupancy)) +
  geom_col(fill = "salmon") +
  coord_flip() +
  labs(title = "Top 10 Course_Name by Mean Occupancy", x = "Course_Name", y = "Avg. Occupancy") +
  theme_minimal()
Basic Calculus is by far the most frequently taken course, followed by
Introduction to Statistics and Calculus and Analytic Geometry. Courses
such as Business Finance and College Algebra I have the longest average
durations, while Organic Chemistry and Accounting-related courses tend
to have the highest average occupancy.
# Course_Type
library(dplyr)
library(ggplot2)
# 1. Ten most frequent Course_Type values, as a count table in descending order
head(sort(table(df[["Course_Type"]]), decreasing = TRUE), n = 10)
##
## MATH STAT CHEM ECON BIOL ACCT CS SPAN PSYC OR
## 6136 1006 908 675 321 244 241 189 186 162
# 2. Bar plot of top 10 categories
# slice_max() supersedes top_n(); like top_n() it keeps ties by default.
# geom_col() is the direct form of geom_bar(stat = "identity") and matches
# the other bar charts in this script.
df %>%
  count(Course_Type) %>%
  slice_max(order_by = n, n = 10) %>%
  ggplot(aes(x = reorder(Course_Type, n), y = n)) +
  geom_col(fill = "skyblue") +
  coord_flip() +
  labs(title = "Top 10 Course_Type by Frequency", x = "Course_Type", y = "Count") +
  theme_minimal()
# 3. Mean session duration per course type: keep types with more than 30
#    rows, then chart the ten with the highest mean as horizontal bars
group_by(df, Course_Type) %>%
  summarise(
    n = n(),
    avg_minutes = mean(Duration_In_Min, na.rm = TRUE)
  ) %>%
  filter(n > 30) %>%
  top_n(n = 10, wt = avg_minutes) %>%
  ggplot(aes(x = reorder(Course_Type, avg_minutes), y = avg_minutes)) +
  geom_col(fill = "salmon") +
  coord_flip() +
  ggtitle("Top 10 Course_Type by Mean Duration") +
  xlab("Course_Type") +
  ylab("Avg. Duration (min)") +
  theme_minimal()
# Occupancy
# 4. Mean Occupancy by category
# (the previous comment said "Mean Duration", but this chart averages
# Occupancy). Only course types with more than 30 rows are considered; the
# ten with the highest mean occupancy are charted. The summary column is
# named for what it holds. slice_max() supersedes top_n() and keeps ties by
# default, as top_n() did.
df %>%
  group_by(Course_Type) %>%
  summarise(
    mean_occupancy = mean(Occupancy, na.rm = TRUE),
    n = n()
  ) %>%
  filter(n > 30) %>%
  slice_max(order_by = mean_occupancy, n = 10) %>%
  ggplot(aes(x = reorder(Course_Type, mean_occupancy), y = mean_occupancy)) +
  geom_col(fill = "salmon") +
  coord_flip() +
  labs(title = "Top 10 Course_Type by Mean Occupancy", x = "Course_Type", y = "Avg. Occupancy") +
  theme_minimal()
Math (MATH) is the most frequent course type by far, followed by
Statistics (STAT) and Chemistry (CHEM). Courses under Finance (FIN) and
Operations Research (OR) have the longest average durations, while
German (GERM) and Accounting (ACCT) course types show the highest
average occupancy, indicating larger class sizes in those areas.
# Check_In_Hour
# Check_Out_Hour
# Check_In_Date
library(dplyr)
library(ggplot2)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# 1. Convert the time and date columns, one assignment per column.
# The two time columns are converted first because the hour columns below
# are derived from the converted values.
df$Check_In_Time <- hms::as_hms(df$Check_In_Time)
df$Check_Out_Time <- hms::as_hms(df$Check_Out_Time)
# Hour component of each converted time
df$Check_In_Hour <- hour(df$Check_In_Time)
df$Check_Out_Hour <- hour(df$Check_Out_Time)
# Calendar date of the check-in
df$Check_In_Date <- as.Date(df$Check_In_Date)
# 2. Histogram of check-in hour, one bin per hour
ggplot(data = df, mapping = aes(Check_In_Hour)) +
  geom_histogram(binwidth = 1, fill = "skyblue", color = "white") +
  ggtitle("Distribution of Check-In Time by Hour") +
  xlab("Hour of Day") +
  ylab("Count") +
  theme_minimal()
# 3. Histogram of check-out hour, one bin per hour
ggplot(data = df, mapping = aes(Check_Out_Hour)) +
  geom_histogram(binwidth = 1, fill = "salmon", color = "white") +
  ggtitle("Distribution of Check-Out Time by Hour") +
  xlab("Hour of Day") +
  ylab("Count") +
  theme_minimal()
# 4. Number of check-ins per calendar date, drawn as a line over time
count(df, Check_In_Date, name = "visits") %>%
  ggplot(aes(x = Check_In_Date, y = visits)) +
  geom_line(color = "steelblue") +
  ggtitle("Check-Ins over Time") +
  xlab("Date") +
  ylab("Count") +
  theme_minimal()
library(ggplot2)
# Define predictors and response variables
predictors <- c("Check_In_Time", "Check_Out_Time", "Check_In_Date")
responses <- c("Duration_In_Min", "Occupancy")
# Create one scatter plot per (predictor, response) pair.
# aes_string() is deprecated; `.data[[name]]` maps a column by its string
# name inside aes(). Axis labels are set explicitly in labs(), so the plots
# keep the plain column names as labels.
for (x in predictors) {
  for (y in responses) {
    p <- ggplot(df, aes(x = .data[[x]], y = .data[[y]])) +
      geom_point(alpha = 0.4, color = "steelblue") +
      labs(
        title = paste("Scatter Plot of", y, "vs", x),
        x = x, y = y
      ) +
      theme_minimal()
    print(p)
  }
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Check-In/Out Time Distributions: Most check-ins occur between 10 AM and
4 PM, peaking around 3 PM, while check-outs are more spread out,
extending later into the evening (around 5–9 PM).
Daily Check-Ins: There’s a clear weekly pattern with dips on weekends
and a large gap during winter break.
Scatter Plots:
Duration vs. Check-In Time: Duration tends to decrease as check-in time
gets later, likely due to closing time constraints.
Occupancy vs. Check-In Time: Occupancy increases until the afternoon,
then gradually declines, aligning with peak usage hours.
Duration vs. Check-Out Time: Later check-outs are generally associated
with longer durations.
Occupancy vs. Check-Out Time: Occupancy peaks around mid-to-late
afternoon, consistent with peak facility use.
Duration/Occupancy vs. Check-In Date: Patterns reflect semester cycles,
with activity ramping up during academic periods and dropping off during
breaks.
library(ggplot2)
library(dplyr)
# Continuous predictors
cont_vars <- c("Total_Credit_Hours_Earned")
# For each predictor draw: (1) a density-scaled histogram with a density
# curve, (2) a boxplot, (3) a scatter plot against Duration_In_Min and
# (4) a scatter plot against Occupancy, the last two with a loess trend line.
# Deprecated ggplot2 constructs are replaced throughout:
#   aes_string(x = var) -> aes(x = .data[[var]])
#   ..density..         -> after_stat(density)
#   size (for lines)    -> linewidth
for (var in cont_vars) {
  cat("\n📊 Now analyzing:", var, "\n")
  # 1. histogram
  p1 <- ggplot(df, aes(x = .data[[var]])) +
    geom_histogram(
      aes(y = after_stat(density)),
      bins = 40, fill = "skyblue", color = "white", alpha = 0.7
    ) +
    geom_density(color = "darkblue", linewidth = 1) +
    labs(title = paste("Histogram + Density of", var), x = var, y = "Density") +
    theme_minimal()
  print(p1)
  # 2. boxplot
  p2 <- ggplot(df, aes(y = .data[[var]])) +
    geom_boxplot(fill = "salmon") +
    labs(title = paste("Boxplot of", var), y = var) +
    theme_minimal()
  print(p2)
  # 3. predictor vs Duration_In_Min (skipped when var is the response itself)
  if (var != "Duration_In_Min") {
    p3 <- ggplot(df, aes(x = .data[[var]], y = Duration_In_Min)) +
      geom_point(alpha = 0.4, color = "steelblue") +
      geom_smooth(method = "loess", color = "red", se = FALSE) +
      labs(title = paste(var, "vs. Duration_In_Min"), x = var, y = "Duration (Min)") +
      theme_minimal()
    print(p3)
  }
  # 4. predictor vs Occupancy (skipped when var is the response itself)
  if (var != "Occupancy") {
    p4 <- ggplot(df, aes(x = .data[[var]], y = Occupancy)) +
      geom_point(alpha = 0.4, color = "steelblue") +
      geom_smooth(method = "loess", color = "red", se = FALSE) +
      labs(title = paste(var, "vs. Occupancy"), x = var, y = "Occupancy") +
      theme_minimal()
    print(p4)
  }
}
##
## 📊 Now analyzing: Total_Credit_Hours_Earned
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
The variable Total_Credit_Hours_Earned is right-skewed, with a peak
around 130–135 credit hours. The boxplot reveals some lower and upper
outliers. Its relationship with both Duration_In_Min and Occupancy
appears weak but slightly increasing, especially at higher credit hour
levels. The loess trend lines suggest that students with more credit
hours may stay a bit longer and be associated with slightly higher
occupancy, but the effect is minimal.
## check the relationship between Duration_In_Min and Occupancy
library(ggplot2)
library(dplyr)
# The two response variables, each plotted against the other
cont_vars <- c( "Duration_In_Min", "Occupancy")
# Each variable is skipped as a predictor of itself, so the loop yields two
# plots: Occupancy (x) vs. Duration_In_Min (y), and Duration_In_Min (x) vs.
# Occupancy (y), each with a loess trend line.
# aes_string() is deprecated; `.data[[var]]` maps a column by its string name.
for (var in cont_vars) {
  cat("\n📊 Now analyzing:", var, "\n")
  # 3. predictor vs Duration_In_Min
  if (var != "Duration_In_Min") {
    p3 <- ggplot(df, aes(x = .data[[var]], y = Duration_In_Min)) +
      geom_point(alpha = 0.4, color = "steelblue") +
      geom_smooth(method = "loess", color = "red", se = FALSE) +
      labs(title = paste(var, "vs. Duration_In_Min"), x = var, y = "Duration (Min)") +
      theme_minimal()
    print(p3)
  }
  # 4. predictor vs Occupancy
  if (var != "Occupancy") {
    p4 <- ggplot(df, aes(x = .data[[var]], y = Occupancy)) +
      geom_point(alpha = 0.4, color = "steelblue") +
      geom_smooth(method = "loess", color = "red", se = FALSE) +
      labs(title = paste(var, "vs. Occupancy"), x = var, y = "Occupancy") +
      theme_minimal()
    print(p4)
  }
}
##
## 📊 Now analyzing: Duration_In_Min
## `geom_smooth()` using formula = 'y ~ x'
##
## 📊 Now analyzing: Occupancy
## `geom_smooth()` using formula = 'y ~ x'
The scatter plots show that Duration_In_Min and Occupancy have a weak
nonlinear relationship.
As Duration_In_Min increases, Occupancy initially stays flat, then
slightly declines.
Conversely, higher Occupancy is associated with slightly longer
durations on average, especially beyond 25 occupants. Overall, the
correlation is minimal: the longest durations tend to occur at lower
occupancy, while the upturn in duration at high occupancy rests on
relatively few records (75% of records have an occupancy of 15 or less).
# Treat the course-code group as a categorical variable so that each group
# gets its own box
df[["Course_Code_by_Thousands"]] <- as.factor(df[["Course_Code_by_Thousands"]])
# Duration distribution within each course-code group
ggplot(data = df, mapping = aes(Course_Code_by_Thousands, Duration_In_Min)) +
  geom_boxplot(fill = "skyblue") +
  ggtitle("Duration by Course Code Group") +
  xlab("Course Code (Grouped)") +
  ylab("Duration (Min)") +
  theme_minimal()
# Occupancy distribution within each course-code group
ggplot(data = df, mapping = aes(Course_Code_by_Thousands, Occupancy)) +
  geom_boxplot(fill = "salmon") +
  ggtitle("Occupancy by Course Code Group") +
  xlab("Course Code (Grouped)") +
  ylab("Occupancy") +
  theme_minimal()
Courses in the 1000–4000 code groups tend to have longer durations and
higher occupancy, with more extreme values and variability. In contrast,
courses in the 5000–7000 range show shorter durations and lower, more
consistent occupancy levels. This suggests lower-level courses are more
intensive and widely attended than upper-level ones. Note, however, that
the 5000–7000 groups together contain only 21 observations (3, 16, and 2,
respectively), so this comparison should be treated with caution.